import re
import requests
from pandas import DataFrame 
# Scrape the Douban Top 250 movie list and export the extracted fields
# (two titles, the director line and the one-line quote) to an Excel workbook.
#
# The list is served as 10 pages of 25 entries; the `start` query parameter is
# the zero-based offset of the first entry on the page.

# A browser-like User-Agent is sent with every request (the original author
# noted that the request needs these headers).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46'
}

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10

# Compiled once, outside the page loop. re.S lets `.` match newlines so a
# single match can span the multi-line HTML of one entry.
# The group names become the column headers of the spreadsheet; the `qoute`
# spelling is kept so the output schema does not change.
# NOTE(review): every `.*?` is unbounded, so an entry without an `inq` span
# would presumably make the match run on into the following entry -- verify
# against the live markup.
pattern = re.compile(r'<div class="item">.*?<span class="title">(?P<name1>.*?)</span>.*?&nbsp;/&nbsp;(?P<name2>.*?)</span>.*?<p class="">(?P<director>.*?)&nbsp;&nbsp;&nbsp;.*?<span class="inq">(?P<qoute>.*?)</span>', re.S)

movies = []  # one dict per matched entry, in page order

try:
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page contains no entries; report it instead of
            # silently contributing zero rows.
            print(f'Skipping {url}: HTTP {response.status_code}')
            continue

        response.encoding = 'utf-8'
        movies.extend(match.groupdict() for match in pattern.finditer(response.text))
finally:
    # Write the workbook once, after scraping, instead of once per row.
    # Doing it in `finally` keeps whatever was collected if a request raises.
    # Nothing is written when no entry was matched, so an existing file is
    # never replaced by an empty one.
    if movies:
        df = DataFrame(movies)
        df.index.names = ['num']
        df.index = df.index + 1  # row numbers start at 1
        df.to_excel(r'./Test/crawler/douban_TOP250.xlsx')


